import os
import re
import html as ihtml
import pandas as pd
import emoji
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Load the raw tweets and remember the original row count so later steps
# can report totals against it.
df = pd.read_csv('data.csv')
tweet = df['text']
totallen = len(df)

# Drop rows with missing values.
print("Removing Na records")
before = len(df)
df = df.dropna()
df = df.reset_index(drop=True)
print("NA records removed: ", before - len(df))

# Drop exact duplicate rows. The original printed totallen - len(df) here,
# which lumped the NA removals into the "duplicates removed" count; count
# duplicates on their own instead.
print("Removing Duplicates records")
before = len(df)
df = df.drop_duplicates()
# Reset index after dropping.
df = df.reset_index(drop=True)
print("duplicates records removed: ", before - len(df))
def preprocess_tweet(row):
    """Return the row's tweet text with every emoji converted to a ':name:' token."""
    return emoji.demojize(row['text'])
# Demojize every tweet, then strip the ':' delimiters that emoji.demojize adds.
df['text']=df.apply(preprocess_tweet,axis=1)
df['text']=df['text'].str.replace(":","")
df.head()
# Extract hashtags: everything from '#' up to the next whitespace or end.
df['hashtags']=df.text.str.findall(r'#.*?(?=\s|$)')
df.head()
# Extract @mentions that appear at the start of the text or after whitespace.
df['mention']=df.text.str.findall(r'(?:(?<=\s)|(?<=^))@.*?(?=\s|$)')
df.head()
from bs4 import BeautifulSoup
def beautify(row):
    """Strip HTML markup, URLs, and redundant whitespace from a tweet's text."""
    unescaped = ihtml.unescape(row['text'])
    cleaned = BeautifulSoup(unescaped, "lxml").text
    cleaned = re.sub(r"http[s]?://\S+", "", cleaned)   # drop links
    return re.sub(r"\s+", " ", cleaned)                # collapse whitespace
# Apply the HTML/URL cleanup, then strip leftover ':' characters.
df['text'] = df.apply(beautify, axis=1)
df['text'] = df['text'].str.replace(":", "")
df.head()
# Remove @username tags.
df['text'] = df['text'].replace(re.compile(r'@[A-Z0-9a-z_:]+'), ' ')
# Remove a leading retweet marker. The original pattern '^[RT]+' was a
# character class, so it also stripped leading runs of bare 'R'/'T'
# letters from ordinary words; '^RT\b' matches only the actual RT tag.
df['text'] = df['text'].replace(re.compile(r'^RT\b'), ' ')
# Keep letters only (drops digits, '#', punctuation).
df['text'] = df['text'].replace(re.compile("[^a-zA-Z]"), " ")
df.head()
df['text'][0]
print("Removing links")
# Remove links and urls (belt-and-braces: beautify already dropped http URLs).
df['text'] = df['text'].replace(re.compile(r'((www\.[\S]+)|(https?://[\S]+))'), "")
df['text'] = df['text'].replace(re.compile(r'((\w+\/\/\S+))'), "")
print("Links are removed")
df['text'] = df['text'].str.replace("_", " ")
df['text'][0]
print("Removing punchuation and special characters")
# Remove punctuation and collapse whitespace runs. regex=True is required
# on pandas >= 2.0, where Series.str.replace defaults to literal matching.
df['text'] = df['text'].str.replace(r'[^\w\s]', ' ', regex=True).str.replace(r'\s\s+', ' ', regex=True)
print("Puntuations removed...")
print("converting string small case")
# Convert text to lower case.
df['text'] = df['text'].str.lower()
df['text'][0]
print("Removing single character words")
# Drop single-character words. Matches are non-overlapping, so consecutive
# singles can survive one pass; the script applies this again later.
df['text'] = df['text'].replace(re.compile(r"(^| ).( |$)"), " ")
print("Removing single character words")
df['text'][0]
# English stop-word list used for filtering.
stop = stopwords.words('english')
# Vocabulary of known English words from the nltk 'words' corpus.
english_word = set(nltk.corpus.words.words())
print("Removing stop words...")
# Remove stop words.
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
print("Stop words removed...")
df['text'][0]
print("Removing non english words")
# Keep only words present in the English vocabulary.
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word in (english_word)]))
print("Non english words removed")
df['text'][0]
df['text']=df['text'].str.strip()
# Second pass of single-character-word removal (the regex is
# non-overlapping, so consecutive singles can survive one pass).
df['text'] = df['text'].replace(re.compile(r"(^| ).( |$)"), " ")
df['text'][0]
print("Removing tweets having words less than 1 words")
# Drop tweets containing fewer than two words (fewer than one space).
df.drop(df[df['text'].str.count(" ") < 1].index , inplace=True)
# Reset index after dropping.
df = df.reset_index(drop=True)
print("tweets having words less than 1 words are removed...")
# NOTE(review): this count is measured against the original file length,
# so it also includes the NA/duplicate rows removed earlier.
print("word count less than 1 records removed: ",totallen-len(df) )
print("new data started writting in new csv file preprocessed_data.csv...")
# Write clean data to a new file.
df.to_csv('preprocessed_data.csv', index=False, encoding="utf-8")
print("clean data is written on preprocessed_data.csv")
print ("total records",len(df))
import nltk
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
# nltk.download()
from nltk.tokenize import word_tokenize
# Load the labelled training set and collapse fine-grained emotion labels
# into the target classes. NOTE: str.replace substitutes substrings, not
# whole-cell values, so labels must not be substrings of one another.
data = pd.read_csv("preprocessed_data_train.csv")
data['emotion'] = data['emotion'].str.replace('Disgust','Shame')
data['emotion'] = data['emotion'].str.replace('Guilt','Shame')
data['emotion'] = data['emotion'].str.replace('Happiness','Happy')
data['emotion'] = data['emotion'].str.replace('Scared','Fear')
data['emotion'] = data['emotion'].str.replace('sadness','Sad')
data['emotion'] = data['emotion'].str.replace('anger','Angry')
data['emotion'] = data['emotion'].str.replace('Mad','Angry')
data['emotion'] = data['emotion'].str.replace('surprise','Surprise')
#data.drop(data.index[data['sentiment'] == "sentiment"], inplace = True)
#data.drop(data.index[data['emotion'] == "Powerful"], inplace = True)
#data.drop(data.index[data['emotion'] == "Peaceful"], inplace = True)
data.emotion.value_counts()
# Tokenization: keep the 2000 most frequent words and pad/truncate every
# sequence to length 32.
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, maxlen=32)
print(X.shape[1])
# LSTM classifier: embedding -> LSTM -> 8-way softmax (one output unit per
# emotion class in the one-hot labels).
enbedding_out_dim = 256
lstm_out_dim = 256
model = Sequential()
model.add(Embedding(max_words, enbedding_out_dim,input_length = X.shape[1]))
# NOTE(review): 257 LSTM units (lstm_out_dim + 1) looks unintentional — confirm.
model.add(LSTM(lstm_out_dim+1))
model.add(Dense(8,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())
# One-hot encode the emotion labels and split train/test 80/20.
dummies = pd.get_dummies(data['emotion'])
Y = dummies.values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 50)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
# Build emotion -> one-hot index and index -> emotion lookup tables by
# scanning rows until every class has been seen.
dict_emotion = {}
dict_label = {}
for i in range(len(Y)):
    dict_emotion[data['emotion'][i]] = np.argmax(Y[i])
    dict_label[np.argmax(Y[i])] = data['emotion'][i]
    if len(dict_emotion) == 8:
        print('Break at: ', i)
        break
print(dict_emotion, dict_label)
# Hold out the first 500 training rows as a validation slice.
X_val = X_train[:500]
Y_val = Y_train[:500]
partial_X_train = X_train[500:]
partial_Y_train = Y_train[500:]
# Train the net. Fit on the portion of the training data that excludes the
# held-out validation slice: the original fit on all of X_train while
# X_val = X_train[:500], so the validation set leaked into training.
# partial_X_train / partial_Y_train were already computed for this purpose.
batch_size = 512
history = model.fit(partial_X_train, partial_Y_train,
                    epochs=50,
                    batch_size=batch_size,
                    validation_data=(X_val, Y_val))
import matplotlib.pyplot as plt
# Plot training vs. validation loss per epoch.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
import matplotlib.pyplot as plt
# Plot training vs. validation accuracy per epoch. The original stored the
# accuracy curves in variables named loss/val_loss and labelled the y-axis
# 'Loss' on an accuracy chart; both corrected here.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'r', label='Training Accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation Accuracy')
plt.title('Training and validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
# Validation: for each held-out sample, compare the argmax of the predicted
# distribution against the argmax of the true one-hot label.
total, correct, false = 0, 0, 0
# print(len(X_val))
for x in range(len(X_val)):
    total += 1
    # print(x)
    # Predict one sample at a time; reshape to (1, sequence_length).
    result = model.predict(X_val[x].reshape(1, X_test.shape[1]), batch_size=1)[0]
    # print(np.argmax(result), np.argmax(Y_val[x]))
    if np.argmax(result) == np.argmax(Y_val[x]):
        correct += 1
    else:
        false += 1
print("accuracy", correct / total * 100, "%")
# print("negative accuracy", neg_correct / negative_count * 100, "%")
# Load the full preprocessed data set for scoring with the trained model.
data = pd.read_csv('preprocessed_data.csv')
data['text'] = data['text'].str.strip()
data = data.dropna()
data = data.reset_index(drop=True)
data.drop(data[data['text'].str.count(" ") < 1].index, inplace=True)
# reset_index returns a new frame; the original discarded this result.
data = data.reset_index(drop=True)
# Tokenization: reuse the tokenizer fitted on the training corpus. The
# original refit a brand-new Tokenizer on this corpus, which produced a
# word-index mapping different from the one the model was trained with,
# so the predictions were made on inconsistent token ids.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, maxlen=32)
print(X.shape[1])
# Prediction on the given data: classify every tweet one sample at a time
# and attach the predicted emotion labels to the dataframe.
em=[]
for i in range(len(X)):
    result = model.predict(X[i].reshape(1,X_test.shape[1]), batch_size=1)[0]
    emotion_value = np.argmax(result)
    emotion = dict_label[emotion_value]
    em.append(emotion)
se = pd.Series(em)
data['emotion']=se.values
data['emotion']
# Report the percentage share of each predicted emotion.
ls=list(set(data['emotion']))
ls
for i in ls:
    em_type=data[data['emotion']==i]
    print(i,"Percentage: ",(len(em_type)/len(data))*100)
print("new data started writting in new csv file preprocessed_data.csv...")
# Write the scored data to a new file.
data.to_csv('predicted_new.csv', index=False, encoding="utf-8")
# NOTE(review): the output file is predicted_new.csv, not preprocessed_data.csv.
print("clean data is written on preprocessed_data.csv")
print ("total records",len(data))
import pandas as pd
import numpy as np
import csv
import re #regular expression
from textblob import TextBlob
import string
import preprocessor as p
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from PIL import Image
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import datetime
import calendar
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import itertools
import collections
from collections import Counter
from palettable.colorbrewer.qualitative import Pastel1_7
import matplotlib.cbook as cbook
#plots
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
# Normalise free-text tweet locations to canonical country names and
# attach ISO codes for mapping.
df = pd.read_csv('predicted_new.csv')
country_code = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
c_name = list(country_code.COUNTRY)
lc_name = country_code.COUNTRY.str.lower()
df['location'] = df['location'].str.lower()
# Map India and its common abbreviations. The original used
# str.contains('in') (plus 'In'/'IN'/'UP', which can never match after
# lowercasing), relabelling every location merely containing the
# substring 'in' — e.g. 'china', 'argentina' — as India. Word boundaries
# restrict matches to whole tokens, and na=False keeps rows with missing
# locations from poisoning the boolean mask.
df.loc[df['location'].str.contains(r'\bindia\b', na=False), 'location'] = 'India'
df.loc[df['location'].str.contains(r'\bin\b', na=False), 'location'] = 'India'
df.loc[df['location'].str.contains(r'\bup\b', na=False), 'location'] = 'India'
# Map any location mentioning a known country name to that country.
for i in range(len(lc_name)):
    df.loc[df['location'].str.contains(re.escape(lc_name[i]), na=False), 'location'] = c_name[i]
cs = set(country_code.COUNTRY)
mcs = set(df.location)
# Keep only tweets whose location resolved to a known country. .copy()
# avoids the SettingWithCopy warning on the 'count' assignment below.
with_country_name = df[df['location'].isin(list(country_code['COUNTRY']))].copy()
with_country_name.location
with_country_name['count'] = with_country_name.location.map(with_country_name.location.value_counts())
# Join ISO codes, then build a deduplicated per-country top-10 table.
all_tweet_location = pd.merge(with_country_name, country_code, left_on="location", right_on="COUNTRY", how="left")
unique_count = all_tweet_location[['location', 'count', 'CODE']]
unique_count = unique_count.drop_duplicates()
unique_count = unique_count.reset_index(drop=True)
unique_count = unique_count.nlargest(10, ['count'])
# Per-emotion row counts over the whole data set. Each df_* here is a
# Series of per-column non-null counts (DataFrame.count()), not a scalar.
total=df['emotion'].count()
df_happy=df[df['emotion']=='Happy'].count()
df_sad=df[df['emotion']=='Sad'].count()
df_angry=df[df['emotion']=='Angry'].count()
df_shame=df[df['emotion']=='Shame'].count()
df_fear=df[df['emotion']=='Fear'].count()
df_surprise=df[df['emotion']=='Surprise'].count()
df.head()
country_code.head()
# Bar chart: top-10 countries by tweet count, coloured by count.
fig = go.Figure(go.Bar(
    x=unique_count['location'][0:],y=unique_count['count'],
    marker={'color': unique_count['count'][:10],
            'colorscale': 'blues'},
    text=unique_count['count'][:10],
    textposition = "outside",
))
fig.update_layout(title_text='Top Countries with most tweets',xaxis_title="Countries",
                  yaxis_title="Number of Tweets",template="plotly_dark",height=700,title_x=0.5)
fig.show()
# Per-emotion slices of the geolocated tweets (reused by later cells).
df_happy=all_tweet_location[all_tweet_location['emotion']=='Happy']
df_sad=all_tweet_location[all_tweet_location['emotion']=='Sad']
df_angry=all_tweet_location[all_tweet_location['emotion']=='Angry']
df_shame=all_tweet_location[all_tweet_location['emotion']=='Shame']
df_fear=all_tweet_location[all_tweet_location['emotion']=='Fear']
# World choropleth of tweet counts keyed by ISO country code.
fig = go.Figure(data=go.Choropleth(
    locations = all_tweet_location['CODE'],
    z = all_tweet_location['count'],
    text = all_tweet_location['location'],
    colorscale = 'rainbow',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = '# of Tweets',
))
fig.update_layout(
    # Title shows the date range covered by the geolocated tweets.
    title_text='Tweets over the world - ({} - {})'.format(all_tweet_location.date.min(),all_tweet_location.date.max()),title_x=0.5,
    geo=dict(
        showframe=True,
        showcoastlines=False,
        projection_type='equirectangular',
    )
)
fig.show()
# Word count per tweet and the average length per emotion class.
df['text_length']=df['text'].str.split(" ").str.len()
print("Average length of Happy Emotion Sentiment tweets : {}".format(round(df[df['emotion']=='Happy']['text_length'].mean(),2)))
print("Average length of Sad Emotion Sentiment tweets : {}".format(round(df[df['emotion']=='Sad']['text_length'].mean(),2)))
print("Average length of Angry Emotion Sentiment tweets : {}".format(round(df[df['emotion']=='Angry']['text_length'].mean(),2)))
print("Average length of Shame Emotion Sentiment tweets : {}".format(round(df[df['emotion']=='Shame']['text_length'].mean(),2)))
print("Average length of Fear Emotion Sentiment tweets : {}".format(round(df[df['emotion']=='Fear']['text_length'].mean(),2)))
df.text=df.text.str.strip()
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def ngram_df(corpus, nrange, n=None):
    """Return a DataFrame of the `n` most frequent n-grams in `corpus`.

    `nrange` is the (min_n, max_n) ngram_range for CountVectorizer;
    English stop words are excluded. Columns: 'text', 'count'.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=nrange).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n], columns=['text', 'count'])
# Top-20 uni/bi/tri-grams over all tweets, stacked in one plotly figure.
unigram_df=ngram_df(df.text,(1,1),20)
bigram_df=ngram_df(df.text,(2,2),20)
trigram_df=ngram_df(df.text,(3,3),20)
fig = make_subplots(
    rows=3, cols=1,subplot_titles=("Unigram over worldwide tweets","Bigram over worldwide tweets",'Trigram over worldwide tweets'),
    specs=[[{"type": "scatter"}],
           [{"type": "scatter"}],
           [{"type": "scatter"}]
           ])
# [::-1] reverses so the most frequent n-gram lands at the top of the
# horizontal bar chart.
fig.add_trace(go.Bar(
    y=unigram_df['text'][::-1],
    x=unigram_df['count'][::-1],
    marker={'color': "blue"},
    text=unigram_df['count'],
    textposition = "outside",
    orientation="h",
    name="Months",
),row=1,col=1)
fig.add_trace(go.Bar(
    y=bigram_df['text'][::-1],
    x=bigram_df['count'][::-1],
    marker={'color': "blue"},
    text=bigram_df['count'],
    name="Days",
    textposition = "outside",
    orientation="h",
),row=2,col=1)
fig.add_trace(go.Bar(
    y=trigram_df['text'][::-1],
    x=trigram_df['count'][::-1],
    marker={'color': "blue"},
    text=trigram_df['count'],
    name="Days",
    orientation="h",
    textposition = "outside",
),row=3,col=1)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='Top N Grams',xaxis_title=" ",yaxis_title=" ",
                  showlegend=False,title_x=0.5,height=1200,template="plotly_dark")
fig.show()
# Same n-gram analysis restricted to tweets located in India.
df_in=df[df.location=='India']
unigram_df=ngram_df(df_in.text,(1,1),20)
bigram_df=ngram_df(df_in.text,(2,2),20)
trigram_df=ngram_df(df_in.text,(3,3),20)
fig = make_subplots(
    rows=3, cols=1,subplot_titles=("Unigram over India tweets","Bigram over India tweets",'Trigram over India tweets'),
    specs=[[{"type": "scatter"}],
           [{"type": "scatter"}],
           [{"type": "scatter"}]
           ])
fig.add_trace(go.Bar(
    y=unigram_df['text'][::-1],
    x=unigram_df['count'][::-1],
    marker={'color': "blue"},
    text=unigram_df['count'],
    textposition = "outside",
    orientation="h",
    name="Months",
),row=1,col=1)
fig.add_trace(go.Bar(
    y=bigram_df['text'][::-1],
    x=bigram_df['count'][::-1],
    marker={'color': "blue"},
    text=bigram_df['count'],
    name="Days",
    textposition = "outside",
    orientation="h",
),row=2,col=1)
fig.add_trace(go.Bar(
    y=trigram_df['text'][::-1],
    x=trigram_df['count'][::-1],
    marker={'color': "blue"},
    text=trigram_df['count'],
    name="Days",
    orientation="h",
    textposition = "outside",
),row=3,col=1)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='Top N Grams',xaxis_title=" ",yaxis_title=" ",
                  showlegend=False,title_x=0.5,height=1200,template="plotly_dark")
fig.show()
# Violin plot: distribution of tweet word counts, worldwide.
fig = go.Figure(data=go.Violin(y=df['text_length'], box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='royalblue', opacity=0.6,
                               x0='Tweet Text Length'))
fig.update_layout(yaxis_zeroline=False,title="Distribution of Text length over worldwide tweets",template='ggplot2')
fig.show()
# Violin plot: same distribution restricted to India.
fig = go.Figure(data=go.Violin(y=df_in['text_length'], box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='royalblue', opacity=0.6,
                               x0='Tweet Text Length'))
fig.update_layout(yaxis_zeroline=False,title="Distribution of Text length over India Tweets",template='ggplot2')
fig.show()
# Violin plots of tweet length split by emotion, worldwide.
fig = go.Figure()
fig.add_trace(go.Violin(y=df[df['emotion']=='Happy']['text_length'],fillcolor='yellow', opacity=0.6,name="Happy",
                        x0='Happy')
              )
fig.add_trace(go.Violin(y=df[df['emotion']=='Sad']['text_length'], line_color='black',
                        fillcolor='blue', opacity=0.6,name="Sad",
                        x0='Sad')
              )
fig.add_trace(go.Violin(y=df[df['emotion']=='Angry']['text_length'], line_color='black',
                        fillcolor='red', opacity=0.6,name="Angry",
                        x0='Angry')
              )
fig.add_trace(go.Violin(y=df[df['emotion']=='Shame']['text_length'], line_color='black',
                        fillcolor='grey', opacity=0.6,name="Shame",
                        x0='Shame')
              )
fig.add_trace(go.Violin(y=df[df['emotion']=='Fear']['text_length'], line_color='black',
                        fillcolor='purple', opacity=0.6,name="Fear",
                        x0='Fear')
              )
fig.update_layout(title_text="Violin - Tweet Length over worldwide",title_x=0.5)
fig.show()
# Violin plots of tweet length split by emotion, India only.
df_in=df[df.location=='India']
fig = go.Figure()
fig.add_trace(go.Violin(y=df_in[df_in['emotion']=='Happy']['text_length'],fillcolor='yellow', opacity=0.6,name="Happy",
                        x0='Happy')
              )
fig.add_trace(go.Violin(y=df_in[df_in['emotion']=='Sad']['text_length'], line_color='black',
                        fillcolor='blue', opacity=0.6,name="Sad",
                        x0='Sad')
              )
fig.add_trace(go.Violin(y=df_in[df_in['emotion']=='Angry']['text_length'], line_color='black',
                        fillcolor='red', opacity=0.6,name="Angry",
                        x0='Angry')
              )
fig.add_trace(go.Violin(y=df_in[df_in['emotion']=='Shame']['text_length'], line_color='black',
                        fillcolor='grey', opacity=0.6,name="Shame",
                        x0='Shame')
              )
fig.add_trace(go.Violin(y=df_in[df_in['emotion']=='Fear']['text_length'], line_color='black',
                        fillcolor='purple', opacity=0.6,name="Fear",
                        x0='Fear')
              )
fig.update_layout(title_text="Violin - Tweet Length over india tweets emotion sentiments",title_x=0.5)
fig.show()
# Top mentioned accounts worldwide. The 'mention' column was round-tripped
# through CSV, so each cell is the string repr of a list; strip the list
# punctuation and split on commas before counting.
words=df['mention'].tolist()
wd=[]
for i in words:
    if len(i)>4:
        i=i.replace('[]','')
        i=i.replace('[','')
        i=i.replace(']','')
        i=i.replace('@','')
        i=i.replace("'",'')
        i=i.replace(' ','')
        i=i.replace(' ','')
        i=i.strip()
        if len(i.split())>0:
            wd+=i.split(',')
    else:
        continue
mention=Counter(wd).keys() # unique mentioned handles
count=Counter(wd).values() # counts the elements' frequency
d = {'mention': list(mention), 'count': list(count)}
dt = pd.DataFrame(data=d)
dt =dt.nlargest(11,['count'])
# Reverse so the most mentioned handle sits at the top of the bar chart.
x=list(dt['count'])
x.reverse()
y=list(dt['mention'])
y.reverse()
list(dt['count'])
fig = go.Figure(go.Bar(
    x=x,
    y=y,
    orientation='h'))
fig.show()
# Top mentioned accounts in India-located tweets (same cleanup as the
# worldwide mention chart above).
df_in=df[df['location']=='India']
words= df_in['mention'].tolist()
wd=[]
for i in words:
    if len(i)>4:
        i=i.replace('[','')
        i=i.replace(']','')
        i=i.replace('@','')
        i=i.replace("'",'')
        i=i.replace(' ','')
        i=i.replace(' ','')
        i=i.strip()
        if len(i.split())>0:
            wd+=i.split(',')
    else:
        continue
mention=Counter(wd).keys() # unique mentioned handles
count=Counter(wd).values() # counts the elements' frequency
d = {'mention': list(mention), 'count': list(count)}
dt = pd.DataFrame(data=d)
dt =dt.nlargest(10,['count'])
x=list(dt['count'])
x.reverse()
y=list(dt['mention'])
y.reverse()
list(dt['count'])
fig = go.Figure(go.Bar(
    x=x,
    y=y,
    orientation='h'))
fig.show()
# Matplotlib bar chart of worldwide emotion counts. Each df_* is a Series
# of per-column counts; .text extracts the scalar row count.
df_happy=df[df['emotion']=='Happy'].count()
df_sad=df[df['emotion']=='Sad'].count()
df_angry=df[df['emotion']=='Angry'].count()
df_shame=df[df['emotion']=='Shame'].count()
df_fear=df[df['emotion']=='Fear'].count()
df_surprise=df[df['emotion']=='Surprise'].count()
# NOTE(review): width is used by plt.bar; ind, df2, and df_surprise are
# computed but unused by this plot.
width = 0.50
ind = np.arange(4)
df2 = df.groupby(df["emotion"],as_index=False).count()
plt.style.use('ggplot')
plt.figure(figsize=(10,8))
plt.bar(["Happy","Sad","Angry","Shame","Fear"],[df_happy.text,df_sad.text,df_angry.text,df_shame.text,df_fear.text],width,alpha=0.5 ,color = ["green","blue","red",'grey', 'lightcoral'])
plt.title("Tweet counts of Different Emotion over the worldwide")
plt.ylabel("Frequency")
plt.xlabel("Sentiments")
plt.show()
## Matplotlib bar chart of emotion counts for India-located tweets.
df_in=df[df['location']=='India']
df_happy=df_in[df_in['emotion']=='Happy'].count()
df_sad=df_in[df_in['emotion']=='Sad'].count()
df_angry=df_in[df_in['emotion']=='Angry'].count()
df_shame=df_in[df_in['emotion']=='Shame'].count()
df_fear=df_in[df_in['emotion']=='Fear'].count()
# NOTE(review): df_surprise and df2 below are computed from the worldwide
# df (not df_in) and are unused by this plot.
df_surprise=df[df['emotion']=='Surprise'].count()
width = 0.50
ind = np.arange(4)
df2 = df.groupby(df["emotion"],as_index=False).count()
plt.style.use('ggplot')
plt.figure(figsize=(10,8))
plt.bar(["Happy","Sad","Angry","Shame","Fear"],[df_happy.text,df_sad.text,df_angry.text,df_shame.text,df_fear.text],width,alpha=0.5 ,color = ["green","blue","red",'grey', 'lightcoral'])
plt.title("Tweet counts of Different Emotion over the India")
plt.ylabel("Frequency")
plt.xlabel("Sentiments")
plt.show()
# Pie chart of emotion shares over all tweets. The labels/sizes/colors
# lists are unused leftovers; the plotly express call uses the per-row
# 'count' helper column (all ones) so px.pie sums rows per emotion.
df_happy=df[df['emotion']=='Happy'].count()
df_sad=df[df['emotion']=='Sad'].count()
df_angry=df[df['emotion']=='Angry'].count()
df_shame=df[df['emotion']=='Shame'].count()
df_fear=df[df['emotion']=='Fear'].count()
df_surprise=df[df['emotion']=='Surprise'].count()
labels = 'Happy','Sad', 'Angry', 'Shame', 'Fear'
sizes = [df_happy.text,df_sad.text,df_angry.text,df_shame.text,df_fear.text]
colors = ['green','blue', 'red','grey', 'lightcoral']
df['count']= 1
fig = px.pie(df,title='Emotion percentage Distribution of tweets over worlds ', values='count', names='emotion',labels='emotion')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Pie chart of emotion shares over India-located tweets.
df_in=df[df['location']=='India']
df_happy=df_in[df_in['emotion']=='Happy'].count()
df_sad=df_in[df_in['emotion']=='Sad'].count()
df_angry=df_in[df_in['emotion']=='Angry'].count()
df_shame=df_in[df_in['emotion']=='Shame'].count()
df_fear=df_in[df_in['emotion']=='Fear'].count()
df_surprise=df[df['emotion']=='Surprise'].count()
# labels/sizes/colors are unused leftovers; px.pie consumes df_in directly.
labels = 'Happy','Sad', 'Angry', 'Shame', 'Fear'
sizes = [df_happy.text,df_sad.text,df_angry.text,df_shame.text,df_fear.text]
colors = ['green','blue', 'red','grey', 'lightcoral']
df['count']= 1
fig = px.pie(df_in,title='Emotion percentage Distribution of tweets over India ', values='count', names='emotion',labels='emotion')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Funnel charts of emotion distribution: worldwide, then India.
temp = df.groupby('emotion').count()['text'].reset_index().sort_values(by='text',ascending=False)
fig = go.Figure(go.Funnelarea(
    text =temp.emotion,
    values = temp.text,
    title = {"position": "top center", "text": "Funnel-Chart of Emotion Distribution in tweets"}
))
fig.show()
temp = df_in.groupby('emotion').count()['text'].reset_index().sort_values(by='text',ascending=False)
fig = go.Figure(go.Funnelarea(
    text =temp.emotion,
    values = temp.text,
    title = {"position": "top center", "text": "Funnel-Chart of Emotion Distribution in tweets"}
))
fig.show()
# Stacked bar of per-emotion tweet counts for the top-10 countries.
df_happy=df[df['emotion']=='Happy']
df_sad=df[df['emotion']=='Sad']
df_angry=df[df['emotion']=='Angry']
df_shame=df[df['emotion']=='Shame']
df_fear=df[df['emotion']=='Fear']
x=list(unique_count.location)
h=[]
s=[]
a=[]
sh=[]
f=[]
# Count each emotion per country in unique_count's (top-10) order.
for i in unique_count.location:
    tempd=all_tweet_location[all_tweet_location['location']==i]
    h.append(len(tempd[tempd.emotion=='Happy']))
    s.append(len(tempd[tempd.emotion=='Sad']))
    a.append(len(tempd[tempd.emotion=='Angry']))
    sh.append(len(tempd[tempd.emotion=='Shame']))
    f.append(len(tempd[tempd.emotion=='Fear']))
fig = go.Figure(go.Bar(x=x, y=h, name='Happy'))
fig.add_trace(go.Bar(x=x, y=s, name='Sad'))
fig.add_trace(go.Bar(x=x, y=a, name='Angry'))
fig.add_trace(go.Bar(x=x, y=sh, name='Shame'))
fig.add_trace(go.Bar(x=x, y=f, name='Fear'))
fig.update_layout(barmode='stack')
# NOTE(review): placeholder category order copied from an example; none of
# these values match a real country, so plotly ignores it.
fig.update_xaxes(categoryorder='array', categoryarray= ['d','a','c','b'])
fig.show()
# Stacked bar of per-emotion tweet counts for India only.
df_in=df[df.location=='India']
df_happy=df_in[df_in['emotion']=='Happy']
df_sad=df_in[df_in['emotion']=='Sad']
df_angry=df_in[df_in['emotion']=='Angry']
df_shame=df_in[df_in['emotion']=='Shame']
df_fear=df_in[df_in['emotion']=='Fear']
x=list(['India'])
h=[]
s=[]
a=[]
sh=[]
f=[]
tempd=all_tweet_location[all_tweet_location['location']=='India']
h.append(len(tempd[tempd.emotion=='Happy']))
s.append(len(tempd[tempd.emotion=='Sad']))
a.append(len(tempd[tempd.emotion=='Angry']))
sh.append(len(tempd[tempd.emotion=='Shame']))
f.append(len(tempd[tempd.emotion=='Fear']))
fig = go.Figure(go.Bar(x=x, y=h, name='Happy'))
fig.add_trace(go.Bar(x=x, y=s, name='Sad'))
fig.add_trace(go.Bar(x=x, y=a, name='Angry'))
fig.add_trace(go.Bar(x=x, y=sh, name='Shame'))
fig.add_trace(go.Bar(x=x, y=f, name='Fear'))
fig.update_layout(barmode='stack')
# NOTE(review): placeholder category order left over from an example;
# plotly ignores values that match no category.
fig.update_xaxes(categoryorder='array', categoryarray= ['d','a','c','b'])
fig.show()
# Build one space-joined text blob per emotion, then drop the words that
# appear in every emotion so each word cloud shows distinctive words only.
df_happy=df[df['emotion']=='Happy']
df_sad=df[df['emotion']=='Sad']
df_angry=df[df['emotion']=='Angry']
df_shame=df[df['emotion']=='Shame']
df_fear=df[df['emotion']=='Fear']
df_surprise=df[df['emotion']=='Surprise']
from wordcloud import WordCloud
happy=""
sad=""
angry=""
shame=""
fear=""
for i in df_happy.text:
    happy+=i.strip()+" "
for i in df_sad.text:
    sad+=i.strip()+" "
for i in df_angry.text:
    angry+=i.strip()+" "
for i in df_shame.text:
    shame+=i.strip()+" "
for i in df_fear.text:
    fear+=i.strip()+" "
happy_split=happy.split()
sad_split=sad.split()
angry_split=angry.split()
shame_split=shame.split()
fear_split=fear.split()
# Words common to all five emotions carry no discriminative signal.
common=set(happy_split) & set(sad_split) & set(angry_split) & set(shame_split) & set(fear_split)
happy_txt = [word for word in happy_split if word not in common]
happy_txt = ' '.join(happy_txt)
sad_txt = [word for word in sad_split if word not in common]
sad_txt = ' '.join(sad_txt)
angry_txt = [word for word in angry_split if word not in common]
angry_txt = ' '.join(angry_txt)
shame_txt = [word for word in shame_split if word not in common]
shame_txt = ' '.join(shame_txt)
fear_txt = [word for word in fear_split if word not in common]
fear_txt = ' '.join(fear_txt)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def PlotWordCloud(words, title):
    """Render a word cloud of `words` (one space-joined string) with `title`."""
    cloud = WordCloud(width=800, height=800, background_color='white').generate(words)
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(title, fontsize=50)
    plt.show()
# Render the per-emotion word clouds for worldwide tweets.
PlotWordCloud(happy_txt, 'Most Happy tweet words')
PlotWordCloud(sad_txt, 'Most Sad tweet words')
PlotWordCloud(angry_txt, 'Most Angry tweet words')
PlotWordCloud(shame_txt, 'Most Shame tweet words')
PlotWordCloud(fear_txt, 'Most Fear tweet words')
# Repeat the word-cloud pipeline for India-located tweets.
# NOTE(review): this overwrites the global df with the India subset, so
# every later cell in the script operates on India-only data.
df=df[df['location']=='India']
df_happy=df[df['emotion']=='Happy']
df_sad=df[df['emotion']=='Sad']
df_angry=df[df['emotion']=='Angry']
df_shame=df[df['emotion']=='Shame']
df_fear=df[df['emotion']=='Fear']
df_surprise=df[df['emotion']=='Surprise']
from wordcloud import WordCloud
happy=""
sad=""
angry=""
shame=""
fear=""
for i in df_happy.text:
    happy+=i.strip()+" "
for i in df_sad.text:
    sad+=i.strip()+" "
for i in df_angry.text:
    angry+=i.strip()+" "
for i in df_shame.text:
    shame+=i.strip()+" "
for i in df_fear.text:
    fear+=i.strip()+" "
happy_split=happy.split()
sad_split=sad.split()
angry_split=angry.split()
shame_split=shame.split()
fear_split=fear.split()
# Drop words common to all five emotions.
common=set(happy_split) & set(sad_split) & set(angry_split) & set(shame_split) & set(fear_split)
happy_txt = [word for word in happy_split if word not in common]
happy_txt = ' '.join(happy_txt)
sad_txt = [word for word in sad_split if word not in common]
sad_txt = ' '.join(sad_txt)
angry_txt = [word for word in angry_split if word not in common]
angry_txt = ' '.join(angry_txt)
shame_txt = [word for word in shame_split if word not in common]
shame_txt = ' '.join(shame_txt)
fear_txt = [word for word in fear_split if word not in common]
fear_txt = ' '.join(fear_txt)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Redefinition of the same plotting helper (notebook cell duplication).
def PlotWordCloud(words, title):
    """Render a word cloud of `words` with the given `title`."""
    wordcloud = WordCloud(width = 800, height = 800,
                          background_color ='white'
                          ).generate(words)
    # Plot the WordCloud image.
    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(title, fontsize=50)
    plt.show()
PlotWordCloud(happy_txt, 'Most Happy tweet words')
PlotWordCloud(sad_txt, 'Most Sad tweet words')
PlotWordCloud(angry_txt, 'Most Angry tweet words')
PlotWordCloud(shame_txt, 'Most Shame tweet words')
PlotWordCloud(fear_txt, 'Most Fear tweet words')
# Strip punctuation from the stringified hashtag lists and collapse runs of
# whitespace. regex=True is required on pandas >= 2.0, where str.replace
# defaults to literal substring replacement; raw strings avoid invalid
# escape-sequence warnings.
df['hashtags'] = df['hashtags'].str.replace(r'[^\w\s]', ' ', regex=True).str.replace(r'\s\s+', ' ', regex=True)
# Word-cloud pipeline over hashtags instead of tweet text.
df_happy=df[df['emotion']=='Happy']
df_sad=df[df['emotion']=='Sad']
df_angry=df[df['emotion']=='Angry']
df_shame=df[df['emotion']=='Shame']
df_fear=df[df['emotion']=='Fear']
df_surprise=df[df['emotion']=='Surprise']
from wordcloud import WordCloud
happy=""
sad=""
angry=""
shame=""
fear=""
for i in df_happy.hashtags:
    happy+=i.strip()+" "
for i in df_sad.hashtags:
    sad+=i.strip()+" "
for i in df_angry.hashtags:
    angry+=i.strip()+" "
for i in df_shame.hashtags:
    shame+=i.strip()+" "
for i in df_fear.hashtags:
    fear+=i.strip()+" "
happy_split=happy.split()
sad_split=sad.split()
angry_split=angry.split()
shame_split=shame.split()
fear_split=fear.split()
# Drop hashtags common to all five emotions.
common=set(happy_split) & set(sad_split) & set(angry_split) & set(shame_split) & set(fear_split)
happy_hashtag = [word for word in happy_split if word not in common]
happy_hashtag = ' '.join(happy_hashtag)
sad_hashtag = [word for word in sad_split if word not in common]
sad_hashtag = ' '.join(sad_hashtag)
angry_hashtag = [word for word in angry_split if word not in common]
angry_hashtag = ' '.join(angry_hashtag)
shame_hashtag = [word for word in shame_split if word not in common]
shame_hashtag = ' '.join(shame_hashtag)
fear_hashtag = [word for word in fear_split if word not in common]
fear_hashtag = ' '.join(fear_hashtag)
# Redefinition of the same plotting helper (notebook cell duplication).
def PlotWordCloud(words, title):
    """Render a word cloud of `words` with the given `title`."""
    wordcloud = WordCloud(width = 800, height = 800,
                          background_color ='white'
                          ).generate(words)
    # Plot the WordCloud image.
    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(title, fontsize=50)
    plt.show()
PlotWordCloud(happy_hashtag, 'Most Happy hashtags')
PlotWordCloud(sad_hashtag, 'Most Sad hashtags')
PlotWordCloud(angry_hashtag, 'Most Angry hashtags')
PlotWordCloud(shame_hashtag, 'Most shame hashtags')
PlotWordCloud(fear_hashtag, 'Most Fear hashtags')
# Same hashtag cleanup as above (duplicated notebook cell). regex=True is
# required on pandas >= 2.0, where str.replace defaults to literal matching.
df['hashtags'] = df['hashtags'].str.replace(r'[^\w\s]', ' ', regex=True).str.replace(r'\s\s+', ' ', regex=True)
df=df[df['location']=='India']
df_happy=df[df['emotion']=='Happy']
df_sad=df[df['emotion']=='Sad']
df_angry=df[df['emotion']=='Angry']
df_shame=df[df['emotion']=='Shame']
df_fear=df[df['emotion']=='Fear']
df_surprise=df[df['emotion']=='Surprise']
from wordcloud import WordCloud
happy=""
sad=""
angry=""
shame=""
fear=""
for i in df_happy.hashtags:
happy+=i.strip()+" "
for i in df_sad.hashtags:
sad+=i.strip()+" "
for i in df_angry.hashtags:
angry+=i.strip()+" "
for i in df_shame.hashtags:
shame+=i.strip()+" "
for i in df_fear.hashtags:
fear+=i.strip()+" "
#print(s)
happy_split=happy.split()
sad_split=sad.split()
angry_split=angry.split()
shame_split=shame.split()
fear_split=fear.split()
common=set(happy_split) & set(sad_split) & set(angry_split) & set(shame_split) & set(fear_split)
happy_hashtag = [word for word in happy_split if word not in common]
happy_hashtag = ' '.join(happy_hashtag)
sad_hashtag = [word for word in sad_split if word not in common]
sad_hashtag = ' '.join(sad_hashtag)
angry_hashtag = [word for word in angry_split if word not in common]
angry_hashtag = ' '.join(angry_hashtag)
shame_hashtag = [word for word in shame_split if word not in common]
shame_hashtag = ' '.join(shame_hashtag)
fear_hashtag = [word for word in fear_split if word not in common]
fear_hashtag = ' '.join(fear_hashtag)
# lower max_font_size, change the maximum number of word and lighten the background:
def PlotWordCloud(words, title):
    """Generate an 800x800 white-background word cloud from `words` and display it."""
    cloud = WordCloud(width=800, height=800, background_color='white').generate(words)
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.title(title, fontsize=50)
    plt.show()
# Draw the per-emotion hashtag clouds.
for tags, caption in [
    (happy_hashtag, 'Most Happy hashtags'),
    (sad_hashtag, 'Most Sad hashtags'),
    (angry_hashtag, 'Most Angry hashtags'),
    (shame_hashtag, 'Most shame hashtags'),
    (fear_hashtag, 'Most Fear hashtags'),
]:
    PlotWordCloud(tags, caption)
tweet = df.text

def _top_word_counts(texts, k=15):
    """Lower-case and whitespace-tokenize each text in `texts`, then return a
    DataFrame of the k most frequent tokens with columns ['words', 'count']."""
    tokens = itertools.chain.from_iterable(t.lower().split() for t in texts)
    return pd.DataFrame(collections.Counter(tokens).most_common(k),
                        columns=['words', 'count'])

# Top-15 word tables per emotion (replaces five copy-pasted pipelines).
clean_tweets_no_happy = _top_word_counts(df_happy.text)
clean_tweets_no_sad = _top_word_counts(df_sad.text)
clean_tweets_no_angry = _top_word_counts(df_angry.text)
clean_tweets_no_shame = _top_word_counts(df_shame.text)
clean_tweets_no_fear = _top_word_counts(df_fear.text)
# Horizontal bar chart of the most common words for each emotion.
for counts_df, bar_color, chart_title in [
    (clean_tweets_no_happy, "green", "Common Words Found in Happy Tweets (Including All Words)"),
    (clean_tweets_no_sad, "blue", "Common Words Found in Sad (Including All Words)"),
    (clean_tweets_no_angry, "red", "Common Words Found in Angry Tweets (Including All Words)"),
    (clean_tweets_no_shame, "grey", "Common Words Found in Shame Tweets (Including All Words)"),
    (clean_tweets_no_fear, "purple", "Common Words Found in Fear Tweets (Including All Words)"),
]:
    fig, ax = plt.subplots(figsize=(12, 8))
    counts_df.sort_values(by='count').plot.barh(x='words', y='count', ax=ax, color=bar_color, alpha=0.7)
    ax.set_title(chart_title)
    plt.show()
from collections import Counter
def random_colours(number_of_colors):
    '''
    Simple function for random colours generation.
    Input:
        number_of_colors - integer value indicating the number of colours which are going to be generated.
    Output:
        Color in the following format: ['#E86DA4'] .
    '''
    hex_digits = '0123456789ABCDEF'
    return ['#' + ''.join(random.choice(hex_digits) for _ in range(6))
            for _ in range(number_of_colors)]
# Treemap of the most common words for each emotion.
for counts_df, tree_title in [
    (clean_tweets_no_happy, 'Tree of Most Common Words in Happy emotion tweets'),
    (clean_tweets_no_sad, 'Tree of Most Common Words in Sad emotion tweets'),
    (clean_tweets_no_angry, 'Tree of Most Common Words in Angry emotion tweets'),
    (clean_tweets_no_shame, 'Tree of Most Common Words in Shame emotion tweets'),
    (clean_tweets_no_fear, 'Tree of Most Common Words in Fear emotion tweets'),
]:
    fig = px.treemap(counts_df.sort_values(by='count'), path=['words'], values='count', title=tree_title)
    fig.show()
tweet = df.text[df['location'] == 'India']

def _top_word_counts(texts, k=15):
    """Lower-case and whitespace-tokenize each text in `texts`, then return a
    DataFrame of the k most frequent tokens with columns ['words', 'count']."""
    tokens = itertools.chain.from_iterable(t.lower().split() for t in texts)
    return pd.DataFrame(collections.Counter(tokens).most_common(k),
                        columns=['words', 'count'])

# Top-15 word tables per emotion. NOTE(review): the per-emotion frames were
# built from the already-India-filtered df, so these match the previous
# worldwide-labelled tables — confirm the intended data split.
clean_tweets_no_happy = _top_word_counts(df_happy.text)
clean_tweets_no_sad = _top_word_counts(df_sad.text)
clean_tweets_no_angry = _top_word_counts(df_angry.text)
clean_tweets_no_shame = _top_word_counts(df_shame.text)
clean_tweets_no_fear = _top_word_counts(df_fear.text)
# Horizontal bar chart of the most common words for each emotion.
for counts_df, bar_color, chart_title in [
    (clean_tweets_no_happy, "green", "Common Words Found in Happy Tweets (Including All Words)"),
    (clean_tweets_no_sad, "blue", "Common Words Found in Sad (Including All Words)"),
    (clean_tweets_no_angry, "red", "Common Words Found in Angry Tweets (Including All Words)"),
    (clean_tweets_no_shame, "grey", "Common Words Found in Shame Tweets (Including All Words)"),
    (clean_tweets_no_fear, "purple", "Common Words Found in Fear Tweets (Including All Words)"),
]:
    fig, ax = plt.subplots(figsize=(12, 8))
    counts_df.sort_values(by='count').plot.barh(x='words', y='count', ax=ax, color=bar_color, alpha=0.7)
    ax.set_title(chart_title)
    plt.show()
from collections import Counter
def random_colours(number_of_colors):
    '''
    Simple function for random colours generation.
    Input:
        number_of_colors - integer value indicating the number of colours which are going to be generated.
    Output:
        Color in the following format: ['#E86DA4'] .
    '''
    return ['#%s' % ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
            for _ in range(number_of_colors)]
# Treemap of the most common words for each emotion.
for counts_df, tree_title in [
    (clean_tweets_no_happy, 'Tree of Most Common Words in Happy emotion tweets'),
    (clean_tweets_no_sad, 'Tree of Most Common Words in Sad emotion tweets'),
    (clean_tweets_no_angry, 'Tree of Most Common Words in Angry emotion tweets'),
    (clean_tweets_no_shame, 'Tree of Most Common Words in Shame emotion tweets'),
    (clean_tweets_no_fear, 'Tree of Most Common Words in Fear emotion tweets'),
]:
    fig = px.treemap(counts_df.sort_values(by='count'), path=['words'], values='count', title=tree_title)
    fig.show()
from palettable.colorbrewer.qualitative import Pastel1_7
tweet = df.text

def _top_word_counts(texts, k=15):
    """Lower-case and whitespace-tokenize each text in `texts`, then return a
    DataFrame of the k most frequent tokens with columns ['words', 'count']."""
    tokens = itertools.chain.from_iterable(t.lower().split() for t in texts)
    return pd.DataFrame(collections.Counter(tokens).most_common(k),
                        columns=['words', 'count'])

# Top-15 word tables per emotion, feeding the donut plots below.
clean_tweets_no_happy = _top_word_counts(df_happy.text)
clean_tweets_no_sad = _top_word_counts(df_sad.text)
clean_tweets_no_angry = _top_word_counts(df_angry.text)
clean_tweets_no_shame = _top_word_counts(df_shame.text)
clean_tweets_no_fear = _top_word_counts(df_fear.text)
# Donut chart (pie with a white centre circle) per emotion.
for counts_df, donut_title in [
    (clean_tweets_no_happy, 'DoNut Plot Of Unique Happy Words'),
    (clean_tweets_no_sad, 'DoNut Plot Of Unique Sad Words'),
    (clean_tweets_no_angry, 'DoNut Plot Of Unique Angry Words'),
    (clean_tweets_no_shame, 'DoNut Plot Of Unique Shame Emotion Words'),
    (clean_tweets_no_fear, 'DoNut Plot Of Unique Fear Emotion Words'),
]:
    pt = counts_df.sort_values(by='count')
    plt.figure(figsize=(16, 10))
    my_circle = plt.Circle((0, 0), 0.7, color='white')
    plt.rcParams['text.color'] = 'black'
    plt.pie(pt['count'], labels=pt.words, colors=Pastel1_7.hex_colors)
    p = plt.gcf()
    p.gca().add_artist(my_circle)
    plt.title(donut_title)
    plt.show()
from palettable.colorbrewer.qualitative import Pastel1_7
tweet = df.text[df.location == 'India']

def _top_word_counts(texts, k=15):
    """Lower-case and whitespace-tokenize each text in `texts`, then return a
    DataFrame of the k most frequent tokens with columns ['words', 'count']."""
    tokens = itertools.chain.from_iterable(t.lower().split() for t in texts)
    return pd.DataFrame(collections.Counter(tokens).most_common(k),
                        columns=['words', 'count'])

# Top-15 word tables per emotion for the India-only donut plots.
clean_tweets_no_happy = _top_word_counts(df_happy.text)
clean_tweets_no_sad = _top_word_counts(df_sad.text)
clean_tweets_no_angry = _top_word_counts(df_angry.text)
clean_tweets_no_shame = _top_word_counts(df_shame.text)
clean_tweets_no_fear = _top_word_counts(df_fear.text)
# Donut chart (pie with a white centre circle) per emotion.
for counts_df, donut_title in [
    (clean_tweets_no_happy, 'DoNut Plot Of Unique Happy Words'),
    (clean_tweets_no_sad, 'DoNut Plot Of Unique Sad Words'),
    (clean_tweets_no_angry, 'DoNut Plot Of Unique Angry Words'),
    (clean_tweets_no_shame, 'DoNut Plot Of Unique Shame Emotion Words'),
    (clean_tweets_no_fear, 'DoNut Plot Of Unique Fear Emotion Words'),
]:
    pt = counts_df.sort_values(by='count')
    plt.figure(figsize=(16, 10))
    my_circle = plt.Circle((0, 0), 0.7, color='white')
    plt.rcParams['text.color'] = 'black'
    plt.pie(pt['count'], labels=pt.words, colors=Pastel1_7.hex_colors)
    p = plt.gcf()
    p.gca().add_artist(my_circle)
    plt.title(donut_title)
    plt.show()
# Daily tweet volume per emotion. groupby(...).count() counts non-null values
# per column, so the 'text' column carries the per-date tweet count.
df_happy = df[df['emotion'] == 'Happy'].groupby(["date"], as_index=False).count()
df_sad = df[df['emotion'] == 'Sad'].groupby(["date"], as_index=False).count()
df_angry = df[df['emotion'] == 'Angry'].groupby(["date"], as_index=False).count()
df_shame = df[df['emotion'] == 'Shame'].groupby(["date"], as_index=False).count()
df_fear = df[df['emotion'] == 'Fear'].groupby(["date"], as_index=False).count()
plt.subplots(1, figsize=(10, 8))
for frame, line_color in [(df_happy, "yellow"), (df_sad, "blue"),
                          (df_angry, "red"), (df_shame, "grey"), (df_fear, "purple")]:
    plt.plot(frame["date"], frame["text"], color=line_color)
plt.legend(["Happy", "Sad", "Angry", "Shame", "Fear"])
plt.title("Emotion Sentiment Analysis DateWise")
plt.xlabel("Dates")
plt.ylabel("Frequency")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
fig, ax = plt.subplots(1, figsize=(10, 8))
# Bubble size mirrors the daily count, so busier days draw larger markers.
for frame, dot_color in [(df_happy, "yellow"), (df_sad, "blue"),
                         (df_angry, "red"), (df_shame, "grey"), (df_fear, "purple")]:
    ax.scatter(frame["date"], frame["text"], color=dot_color, s=frame["text"], alpha=0.5)
ax.set_xlabel('Weeks', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Emotion Sentiment change Datewise')
ax.grid(True)
fig.tight_layout()
plt.show()
# Restrict to India, then plot daily tweet volume per emotion.
df = df[df['location'] == 'India']
df_happy = df[df['emotion'] == 'Happy'].groupby(["date"], as_index=False).count()
df_sad = df[df['emotion'] == 'Sad'].groupby(["date"], as_index=False).count()
df_angry = df[df['emotion'] == 'Angry'].groupby(["date"], as_index=False).count()
df_shame = df[df['emotion'] == 'Shame'].groupby(["date"], as_index=False).count()
df_fear = df[df['emotion'] == 'Fear'].groupby(["date"], as_index=False).count()
plt.subplots(1, figsize=(10, 8))
for frame, line_color in [(df_happy, "yellow"), (df_sad, "blue"),
                          (df_angry, "red"), (df_shame, "grey"), (df_fear, "purple")]:
    plt.plot(frame["date"], frame["text"], color=line_color)
plt.legend(["Happy", "Sad", "Angry", "Shame", "Fear"])
plt.title("Emotion Sentiment Analysis DateWise")
plt.xlabel("Dates")
plt.ylabel("Frequency")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
fig, ax = plt.subplots(1, figsize=(10, 8))
# Marker area scales with the daily count for each emotion.
for frame, dot_color in [(df_happy, "yellow"), (df_sad, "blue"),
                         (df_angry, "red"), (df_shame, "grey"), (df_fear, "purple")]:
    ax.scatter(frame["date"], frame["text"], color=dot_color, s=frame["text"], alpha=0.5)
ax.set_xlabel('Weeks', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Sentiment change weekwise')
ax.grid(True)
fig.tight_layout()
plt.show()
all_tweet_location  # notebook echo of the frame; harmless as a bare expression
# Daily per-emotion tweet counts over the full (worldwide) data set.
df_happy = all_tweet_location[all_tweet_location['emotion'] == 'Happy'].groupby(["date"], as_index=False).count()
df_sad = all_tweet_location[all_tweet_location['emotion'] == 'Sad'].groupby(["date"], as_index=False).count()
df_angry = all_tweet_location[all_tweet_location['emotion'] == 'Angry'].groupby(["date"], as_index=False).count()
df_shame = all_tweet_location[all_tweet_location['emotion'] == 'Shame'].groupby(["date"], as_index=False).count()
df_fear = all_tweet_location[all_tweet_location['emotion'] == 'Fear'].groupby(["date"], as_index=False).count()
fig = go.Figure()
# BUG FIX: the original plotted df_happy's counts under the 'Sad' trace and
# used df_happy's dates as x for every trace; each emotion now uses its own
# frame for both axes.
# NOTE(review): assumes the grouped frames expose a 'count' column — confirm
# against all_tweet_location's schema.
for frame, trace_name in [(df_sad, 'Sad'), (df_angry, 'Angry'), (df_happy, 'Happy'),
                          (df_shame, 'Shame'), (df_fear, 'Fear')]:
    fig.add_trace(go.Scatter(x=frame['date'],
                             y=frame['count'],
                             mode='lines+markers',
                             name=trace_name))
fig.update_layout(
    title_text='Different Emotion Tweets per Day Worldwide : ({} - {})'.format(df_happy['date'].min(),
                                                                               df_happy['date'].max()),
    template="plotly_dark",
    title_x=0.5)
fig.show()
# Restrict to India, then plot daily per-emotion tweet counts.
all_tweet_location = all_tweet_location[all_tweet_location['location'] == 'India']
df_happy = all_tweet_location[all_tweet_location['emotion'] == 'Happy'].groupby(["date"], as_index=False).count()
df_sad = all_tweet_location[all_tweet_location['emotion'] == 'Sad'].groupby(["date"], as_index=False).count()
df_angry = all_tweet_location[all_tweet_location['emotion'] == 'Angry'].groupby(["date"], as_index=False).count()
df_shame = all_tweet_location[all_tweet_location['emotion'] == 'Shame'].groupby(["date"], as_index=False).count()
df_fear = all_tweet_location[all_tweet_location['emotion'] == 'Fear'].groupby(["date"], as_index=False).count()
fig = go.Figure()
# BUG FIX: the original plotted df_happy's counts under the 'Sad' trace and
# used df_happy's dates as x for every trace; each emotion now uses its own
# frame for both axes.
# NOTE(review): assumes the grouped frames expose a 'count' column — confirm
# against all_tweet_location's schema.
for frame, trace_name in [(df_sad, 'Sad'), (df_angry, 'Angry'), (df_happy, 'Happy'),
                          (df_shame, 'Shame'), (df_fear, 'Fear')]:
    fig.add_trace(go.Scatter(x=frame['date'],
                             y=frame['count'],
                             mode='lines+markers',
                             name=trace_name))
# Title fixed: the original said "Wordlwide" (typo) although the data is
# filtered to India on the first line of this cell.
fig.update_layout(
    title_text='Different Emotion Tweets per Day in India : ({} - {})'.format(df_happy['date'].min(),
                                                                              df_happy['date'].max()),
    template="plotly_dark",
    title_x=0.5)
fig.show()
Sentiment analysis of tweets determines the emotions behind various human behaviors and the inclination of a vast population towards a specific topic, item, or entity. Sentiment analysis, or opinion mining, is the computational treatment of opinions, sentiments, and subjectivity in text. As a branch of digital analytics, sentiment analysis aims to determine the attitude of a speaker or a writer with respect to some topic, or the overall contextual polarity of a document.
We used an LSTM for sentiment analysis and for predicting the sentiments of the tweets, achieving an accuracy of 94%. Our analysis found that, worldwide, approximately 31.9% of tweets express fear, 34.6% happiness, 17% sadness, 12.9% anger, and 3.68% shame; in India, 28.5% express fear, 39% happiness, 17.4% sadness, 11.5% anger, and 3.6% shame. We can thus conclude that people in India are posting negative tweets due to COVID, and that people in India, as well as worldwide, are facing mental-health issues or thinking negatively and posting negative tweets on Twitter.